Load the Iris dataset and libraries

library(tidyverse)  # data manipulation (dplyr, tidyr, ggplot2, ...)
library(cluster)    # clustering algorithms
library(factoextra) # clustering algorithms & visualization
library(dendextend) # nice dendrograms 
library(pheatmap) # nice heatmaps
library(caret) #another option for pca
# Load the built-in iris dataset (150 flowers, 4 measurements + Species)
data("iris")
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Explore the iris dataset

Challenge

Take a few moments to explore the Iris dataset. What can you learn? Which species do you think will be easier to separate?

{: .source}

Solution


# Dimensions and structure of the data
dim(iris)
str(iris)

# Sepal measurements: setosa separates well; versicolor and virginica overlap.
# (The original repeated this exact plot twice — the duplicate is removed.)
iris %>%
  ggplot(aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  theme_minimal()

# Petal measurements separate the three species much more cleanly
iris %>%
  ggplot(aes(x = Petal.Width, y = Petal.Length, col = Species)) +
  geom_point() +
  theme_minimal()

# Boxplots of every measurement, one facet per variable.
# pivot_longer() supersedes the retired gather().
iris %>%
  pivot_longer(-Species, names_to = "key", values_to = "value") %>%
  ggplot(aes(y = value, fill = Species)) +
  geom_boxplot() +
  facet_wrap(. ~ key) +
  theme_bw()

{: .output} {: .solution} {: .challenge}

What if we didn’t know we had 3 species? Could we use the morphological data alone to study this problem?

Clustering

k-means clustering

# Standardise the four numeric columns (mean 0, sd 1) so that no single
# variable dominates the distance calculations
iris_scaled <- scale(iris[, 1:4])

# Label each row with a two-letter species code plus a running index,
# e.g. "se_1" ... "vi_150". seq_along() replaces the redundant
# seq(1:length(x)) idiom (same values, clearer intent).
rownames(iris_scaled) <- paste(substr(iris$Species, 1, 2),
                               seq_along(iris$Species),
                               sep = "_")

# Pairwise (Euclidean) distances between all samples, shown as a heatmap
distance <- get_dist(iris_scaled)
fviz_dist(distance)

# k-means with k = 2; nstart = 25 tries 25 random starts and keeps the best.
# Seed the RNG first so the partition is reproducible (the original only
# set a seed further down, after this model was already fit).
set.seed(42)
k2 <- kmeans(iris_scaled, centers = 2, nstart = 25)
str(k2)
## List of 9
##  $ cluster     : Named int [1:150] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:150] "se_1" "se_2" "se_3" "se_4" ...
##  $ centers     : num [1:2, 1:4] -1.011 0.506 0.85 -0.425 -1.301 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:2] 47.4 173.5
##  $ tot.withinss: num 221
##  $ betweenss   : num 375
##  $ size        : int [1:2] 50 100
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Visualise the 2-cluster solution (points projected onto the first two PCs)
fviz_cluster(k2, data = iris_scaled) + theme_minimal()

# Choosing k: seed the RNG first (the gap statistic below resamples data)
set.seed(42)
# Elbow method: look for the bend in total within-cluster sum of squares
fviz_nbclust(iris_scaled, kmeans, method = "wss")

# Average silhouette width: higher is better
fviz_nbclust(iris_scaled, kmeans, method = "silhouette")

# Gap statistic: compares observed WSS against a null reference distribution
fviz_nbclust(iris_scaled, kmeans, method = "gap_stat")

# k = 3: the value suggested by the diagnostics above, and matching the
# number of known species
k3 <- kmeans(iris_scaled, centers = 3, nstart = 25)
str(k3)
## List of 9
##  $ cluster     : Named int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
##   ..- attr(*, "names")= chr [1:150] "se_1" "se_2" "se_3" "se_4" ...
##  $ centers     : num [1:3, 1:4] 1.1322 -1.0112 -0.0501 0.0881 0.8504 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:3] 47.5 47.4 44.1
##  $ tot.withinss: num 139
##  $ betweenss   : num 457
##  $ size        : int [1:3] 47 50 53
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Visualise the 3-cluster solution
fviz_cluster(k3, data = iris_scaled) + theme_bw()

# Deliberately over-clustered solution (k = 7) for comparison
k7 <- kmeans(iris_scaled, centers = 7, nstart = 25)
str(k7)
## List of 9
##  $ cluster     : Named int [1:150] 2 1 1 1 2 2 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:150] "se_1" "se_2" "se_3" "se_4" ...
##  $ centers     : num [1:7, 1:4] -1.303 -0.719 0.289 0.954 0.36 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:7] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:7] 9.65 12.15 11.83 7.55 5.94 ...
##  $ tot.withinss: num 71.1
##  $ betweenss   : num 525
##  $ size        : int [1:7] 25 25 29 21 17 21 12
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Visualise the 7-cluster solution — note how the groups fragment
fviz_cluster(k7, data = iris_scaled) + theme_bw()

Challenge

Choose whichever clustering approach you think worked best among the above. If you partition the data this way, which of the variables is most distinct in the clusters?

{: .source}

Solution


# Attach the k = 3 cluster assignment to the raw data, then summarise each
# cluster: its most frequent species and the mean of every measurement.
iris %>%
  mutate(Cluster = k3$cluster) %>%
  group_by(Cluster) %>%
  summarize(MostFreqSpecies = names(which.max(table(Species))),
            Sepal.Length    = mean(Sepal.Length),
            Sepal.Width     = mean(Sepal.Width),
            Petal.Width     = mean(Petal.Width),
            Petal.Length    = mean(Petal.Length))

{: .output} {: .solution} {: .challenge}

Hierarchical clustering

The first step is to compute the distance between each pair of samples; by default, `hclust` then merges clusters using the complete linkage method.

# Hierarchical clustering on the Euclidean distance matrix
# (hclust() defaults to complete linkage)
iris_hcl <- hclust(dist(iris_scaled))
plot(iris_hcl)

# Cut the dendrogram into 3 clusters and cross-tabulate against the known
# species to see how well they are recovered. (`<-` replaces `=` for
# top-level assignment, per R style convention.)
dendcut <- cutree(iris_hcl, 3)
table(dendcut, iris$Species)
##        
## dendcut setosa versicolor virginica
##       1     49          0         0
##       2      1         21         2
##       3      0         29        48
# Base-R plot of the same tree, via a dendrogram object
plot(as.dendrogram(iris_hcl))

# factoextra dendrogram: hcut() clusters (complete linkage), cuts the tree
# at k = 3 and colours the resulting clusters
fviz_dend(
  hcut(iris_scaled, k = 3, hc_method = "complete"),
  rect = TRUE,
  cex = 0.5,
  palette = "Set1"
)

# Row-annotation data frame: each sample's two-letter species code,
# keyed by the row names of the scaled matrix
collabels <- data.frame(species = substr(rownames(iris_scaled), 1, 2))
rownames(collabels) <- rownames(iris_scaled)

# Heatmap of the scaled data, with rows ordered by our hclust tree
pheatmap(iris_scaled,
         cluster_rows = iris_hcl,
         treeheight_row = 30,
         treeheight_col = 30,
         annotation_row = collabels)

Challenge

Try constructing a heatmap using another agglomeration method, and visualise the results.

Do you think your approach is better or worse than the “default”? Compare with your group…

{: .source}

Solution

# Re-cluster the rows using Ward's agglomeration method, then redraw
# the heatmap with the new row ordering
iris_hcl <- hclust(dist(iris_scaled), method = "ward.D")
pheatmap(iris_scaled,
         cluster_rows = iris_hcl,
         treeheight_row = 30,
         treeheight_col = 30,
         annotation_row = collabels)

{: .output} {: .solution} {: .challenge}

# PCA on the scaled measurements (already centred and scaled above,
# so prcomp needs no extra arguments here)
iris_pca <- prcomp(iris_scaled)

# Scores: each sample's coordinates on the principal components.
# (`<-` replaces `=` for top-level assignment, per R style convention.)
iris_scores <- as.data.frame(iris_pca$x)
iris_scores$species <- substr(row.names(iris_scores), 1, 2)

# Plot of observations in the PC1/PC2 plane, labelled by species code
ggplot(data = iris_scores, aes(x = PC1, y = PC2, color = species, label = species)) +
  geom_text(alpha = 0.8, size = 4) +
  ggtitle("First two PC of Iris data") +
  theme_minimal()

# Scree plot: percentage of variance explained by each component
fviz_eig(iris_pca, addlabels = TRUE)

# Plot of observations in the PC2/PC3 plane
ggplot(data = iris_scores, aes(x = PC2, y = PC3, color = species, label = species)) +
  geom_text(alpha = 0.8, size = 4) +
  ggtitle("PC2/3 of Iris data") +
  theme_minimal()

Let’s look at the rotation matrix:

# Loadings: the weight of each original variable in each principal component
iris_pca$rotation
##                     PC1         PC2        PC3        PC4
## Sepal.Length  0.5210659 -0.37741762  0.7195664  0.2612863
## Sepal.Width  -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length  0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width   0.5648565 -0.06694199 -0.6342727  0.5235971
# Variable correlation plot, coloured by quality of representation (cos2)
fviz_pca_var(iris_pca,  
             col.var = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

# Contributions of variables to PC1
fviz_contrib(iris_pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(iris_pca, choice = "var", axes = 2, top = 10)

# Visualize individuals coloured by species
# (habillage supplies the grouping factor used for colours and ellipses)
fviz_pca_ind(iris_pca,
             label = "none", 
             habillage = iris$Species, 
             palette = "Set1",
             addEllipses = TRUE 
             )

# PCA via caret: Box-Cox transform, centre, scale, then PCA in one step
iris_trans <- preProcess(iris[,1:4], method=c("BoxCox", "center",  "scale", "pca"))
iris_PCaret <- predict(iris_trans, iris[,1:4])
iris_PCaret # caret retains only the PCs needed to capture >= 95% of the variance (its default threshold)
##             PC1          PC2
## 1   -2.30353961 -0.474826012
## 2   -2.15130962  0.648290323
## 3   -2.46134124  0.346392126
## 4   -2.41396812  0.606683881
## 5   -2.43277736 -0.611005666
## 6   -1.97917155 -1.395059430
## 7   -2.50115378 -0.006518402
## 8   -2.28638088 -0.229199408
## 9   -2.48383388  1.159498745
## 10  -2.33581242  0.443987500
## 11  -2.17167287 -1.011051847
## 12  -2.40408015 -0.120269521
## 13  -2.38192257  0.714883104
## 14  -2.88538841  1.024170864
## 15  -2.16605616 -1.737980893
## 16  -2.09777970 -2.391303176
## 17  -2.11003673 -1.383989325
## 18  -2.17836253 -0.483061847
## 19  -1.83599672 -1.355959687
## 20  -2.31125904 -1.046088311
## 21  -1.93737228 -0.445315773
## 22  -2.14789585 -0.870075742
## 23  -2.87087622 -0.371786247
## 24  -1.75630224 -0.112582094
## 25  -2.30593126 -0.128572099
## 26  -2.01143673  0.587476488
## 27  -2.01927769 -0.247388061
## 28  -2.19927409 -0.530725435
## 29  -2.17458180 -0.333747858
## 30  -2.36319236  0.338089547
## 31  -2.22443371  0.487464454
## 32  -1.76841797 -0.455201848
## 33  -2.67622119 -1.610906247
## 34  -2.39807281 -1.942435053
## 35  -2.18117474  0.433813350
## 36  -2.26606711  0.179855760
## 37  -2.05803552 -0.678662754
## 38  -2.66185534 -0.545552734
## 39  -2.58060936  0.945557589
## 40  -2.21341474 -0.283383488
## 41  -2.28404497 -0.426110240
## 42  -1.88759440  2.516670631
## 43  -2.70437456  0.526866756
## 44  -1.88695330 -0.454081303
## 45  -2.07118404 -1.064343709
## 46  -2.10210781  0.696473119
## 47  -2.40371982 -1.040620003
## 48  -2.50786821  0.402470021
## 49  -2.24054721 -0.959906308
## 50  -2.26049417 -0.028181150
## 51   1.12507182 -0.903871861
## 52   0.79118659 -0.657201417
## 53   1.26040902 -0.667116963
## 54   0.55272346  1.839079983
## 55   1.13447975  0.155012571
## 56   0.49256530  0.525384382
## 57   0.79884457 -0.826102580
## 58  -0.38054459  1.933801527
## 59   0.99984229 -0.100525205
## 60   0.08880332  1.016640487
## 61   0.07447330  2.939105790
## 62   0.51707110 -0.007629525
## 63   0.74725565  1.876352767
## 64   0.80580689  0.108088184
## 65   0.06727687  0.376717161
## 66   0.92670853 -0.568734868
## 67   0.42293272  0.126858901
## 68   0.28955671  0.729883259
## 69   1.36043786  1.751163755
## 70   0.30507192  1.300403803
## 71   0.76560630 -0.454505340
## 72   0.57888769  0.353645374
## 73   1.32475341  0.933440428
## 74   0.74238439  0.338583304
## 75   0.78831001 -0.008025008
## 76   0.93388033 -0.315878681
## 77   1.30442768  0.030080677
## 78   1.36606600 -0.385504872
## 79   0.74120785  0.154788829
## 80   0.09823040  1.027682646
## 81   0.27891782  1.599274503
## 82   0.17594741  1.606664291
## 83   0.36211946  0.726339119
## 84   1.13052928  0.583859684
## 85   0.28892996  0.226368291
## 86   0.49503334 -0.885318287
## 87   1.08659598 -0.581099441
## 88   1.18397246  1.456428797
## 89   0.16679916  0.146170848
## 90   0.40465456  1.338171087
## 91   0.39906305  1.091427772
## 92   0.70903141 -0.105852971
## 93   0.46389004  0.957178420
## 94  -0.23098983  2.132630823
## 95   0.39648578  0.809743029
## 96   0.19921519  0.099284236
## 97   0.32879173  0.311682303
## 98   0.67132654  0.078845991
## 99  -0.33398735  1.581218162
## 100  0.36170012  0.536454487
## 101  1.71009852 -0.894052241
## 102  1.17750703  0.665311848
## 103  2.09602938 -0.583674198
## 104  1.45686536 -0.020733757
## 105  1.79071937 -0.342789495
## 106  2.57579797 -0.789255221
## 107  0.38661731  1.624346446
## 108  2.22872233 -0.443217306
## 109  2.02202320  0.728517750
## 110  2.06255070 -1.845013769
## 111  1.33145478 -0.735101605
## 112  1.60565877  0.390424412
## 113  1.80608901 -0.454476108
## 114  1.27553784  1.185649555
## 115  1.37421871  0.420316127
## 116  1.49802971 -0.708627142
## 117  1.47524518 -0.320188330
## 118  2.24416671 -2.405172717
## 119  3.09265796  0.059744071
## 120  1.40319967  1.827045856
## 121  1.90606835 -0.925524073
## 122  0.97032438  0.535629764
## 123  2.73261448 -0.385540928
## 124  1.36076243  0.448269952
## 125  1.63330806 -1.040666715
## 126  1.89192522 -1.032572862
## 127  1.20180371  0.267191352
## 128  1.04492149 -0.129797369
## 129  1.74510746  0.147351199
## 130  1.83436773 -0.600722165
## 131  2.33505041 -0.256590147
## 132  2.13401639 -2.460027089
## 133  1.79784284  0.143881556
## 134  1.18290593  0.226688357
## 135  1.30221896  0.766418300
## 136  2.56490300 -0.818062253
## 137  1.47058012 -1.077939890
## 138  1.35553596 -0.489475118
## 139  0.95130018 -0.081802256
## 140  1.76458306 -0.703363361
## 141  1.87704707 -0.638641032
## 142  1.77095437 -0.701937529
## 143  1.17750703  0.665311848
## 144  1.91770918 -0.891113793
## 145  1.83877014 -1.054184779
## 146  1.75787163 -0.412513140
## 147  1.58947600  0.915408402
## 148  1.48793628 -0.319178298
## 149  1.29529989 -1.025276257
## 150  0.98752038 -0.044117086
# Plot the caret PCA scores. Add the species as a proper column rather than
# reaching into iris$Species from inside aes() — referencing an external
# vector in aes() is fragile (it breaks if the data are filtered or
# reordered) and also produces an ugly legend title.
iris_PCaret$species <- iris$Species
ggplot(data = iris_PCaret, aes(x = PC1, y = PC2, color = species, label = species)) +
  geom_text(alpha = 0.8, size = 4) +
  ggtitle("First two PC of Iris data") +
  theme_minimal()

# We can see the results are the same as above

Challenge

Perform a PCA on the Ames housing filtered and unfiltered datasets. How much variance is explained by the top components? What is the difference between including and not including the outlier points?

{: .source}

Solution


# Drop the last element (the sale variable) from the numeric-variable list.
# NB: the original `numericVars[1:length(numericVars) - 1]` evaluated as
# (1:n) - 1 = 0:(n-1) because `:` binds tighter than `-`; it only worked
# because R silently drops index 0. head(x, -1) states the intent directly.
numericVarsWoutSale <- head(numericVars, -1)

# How many rows are complete for these predictors?
table(complete.cases(ameshousingFilt[, numericVarsWoutSale]))

# PCA on the complete cases of the filtered data.
# `scale.` is spelled out — the original `scale =` relied on partial
# argument matching against prcomp's `scale.` argument.
ames_pca <- prcomp(ameshousingFilt[complete.cases(ameshousingFilt[, numericVarsWoutSale]), numericVarsWoutSale],
                   center = TRUE,
                   scale. = TRUE)

# Variance explained by each component
fviz_screeplot(ames_pca, addlabels = TRUE, ylim = c(0, 25))
# Variable map, coloured by contribution
fviz_pca_var(ames_pca, col.var = "contrib", gradient.cols = c("red", "blue", "black"), repel = TRUE)

# Contributions of variables to PC1
fviz_contrib(ames_pca, choice = "var", axes = 1, top = 10)
# Contributions of variables to PC2
fviz_contrib(ames_pca, choice = "var", axes = 2, top = 10)
fviz_pca_ind(ames_pca, label = "none")





# Repeat on the unfiltered data (outlier points included) for comparison
table(complete.cases(ameshousing[, numericVarsWoutSale]))

# `scale.` is spelled out — the original `scale =` relied on partial
# argument matching against prcomp's `scale.` argument.
ames_pca2 <- prcomp(ameshousing[complete.cases(ameshousing[, numericVarsWoutSale]), numericVarsWoutSale],
                    center = TRUE,
                    scale. = TRUE)

# Variance explained by each component
fviz_screeplot(ames_pca2, addlabels = TRUE, ylim = c(0, 25))
# Variable map, coloured by contribution
fviz_pca_var(ames_pca2, col.var = "contrib", gradient.cols = c("red", "blue", "black"), repel = TRUE)

# Contributions of variables to PC1
fviz_contrib(ames_pca2, choice = "var", axes = 1, top = 10)
# Contributions of variables to PC2
fviz_contrib(ames_pca2, choice = "var", axes = 2, top = 10)
fviz_pca_ind(ames_pca2, label = "none")

{: .output} {: .solution} {: .challenge}